By:
Heba El-Shimy
PhD Scholar and Teaching Assistant
Heriot-Watt University, Dubai Campus
Based on content from Python Lecture 1
Heriot-Watt University, Edinburgh Campus
Author: Daniel Kienitz
conda update conda
conda create -n yourenvname anaconda
conda activate yourenvname
Note: use `conda deactivate` to deactivate your virtual environment.
conda install -c anaconda -n yourenvname seaborn
conda install -c menpo -n yourenvname opencv3
jupyter notebook
A Jupyter notebook server will start and open in a new browser window. If it does not open automatically, Jupyter will have printed a link containing a security token in your CMD or Terminal; copy and paste that link into your browser.
In the Jupyter webpage, click New --> Python 3 under notebooks to create a new notebook.
The Jupyter Notebook is an open-source web application that allows you to create and share documents that contain live code, equations, visualizations and narrative text.
Cells expect code as input by default, but you can change the cell Format to Markdown from the toolbar above to write pieces of text like this one.
To edit a cell's content, activate that cell by clicking on it, or using your keyboard's ↑ or ↓ arrows to move through the cells. Once you reach the cell you need to edit, hit ↵.
To run a cell's content (code, or Markdown for text), press "Shift + ↵", or click "Run" in the toolbar above.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import statsmodels.api as sm
import re
# First attempt: load the CSV file into a dataframe and keep it in a variable,
# telling pandas only that the columns are separated by semicolons
mtcars = pd.read_csv(
    'mtcars.csv',
    sep=';',
)
# Display the dataframe (pandas truncates it if it is too long/wide)
mtcars
# Second attempt: read the same file again, this time also
#   - treating the comma as the decimal mark,
#   - treating the text 'None' as a missing value (NaN),
#   - using the first column as the dataframe index
mtcars = pd.read_csv(
    'mtcars.csv',
    sep=';',
    decimal=',',
    na_values='None',
    index_col=0,
)
mtcars
# Dimensions of the dataframe as a (rows, columns) tuple
mtcars.shape
# First 5 rows of the dataframe
mtcars.head(n=5)
# Last 5 rows of the dataframe
mtcars.tail(n=5)
# Using pandas built-ins to work with the dataframe:
# the .index attribute holds the row labels as a pandas Index,
# which is turned into a plain Python list here
list(mtcars.index)
# Data type of each column
# The object dtype is what pandas uses for strings; such columns support string operations
mtcars.dtypes
# Names of all columns
mtcars.columns
# Select a column by its name (a string)
mtcars.loc[:, 'disp']
# Select a column by its position (an integer)
mtcars.iloc[:, 2]
# Select a row by its index label (a string)
mtcars.loc['Ford Pantera L']
# Select a row by its position (an integer)
mtcars.iloc[28]
# Test your skills
# TODO 1: Select a single cell in a certain row and column
# TODO 2: Select multiple columns
# Define a function to convert mpg to liter/100km
def lp100(mpg_val):
    """Convert fuel economy from miles per gallon to litres per 100 km.

    Uses the rounded conversion factors 3.78 litres per gallon and
    1.6 km per mile, i.e. ``100 * 3.78 / (1.6 * mpg_val)``.

    Args:
        mpg_val: Fuel economy in miles per gallon. Any value supporting
            arithmetic works (a number, a numpy array, a pandas Series).

    Returns:
        The equivalent fuel consumption in litres per 100 km.

    Raises:
        ZeroDivisionError: If ``mpg_val`` is a plain Python number equal to 0.
    """
    litres_per_gallon = 3.78
    km_per_mile = 1.6
    return 100 * litres_per_gallon / (km_per_mile * mpg_val)
# Adding a new column to the dataframe
# Select the 'mpg' column and use the apply method,
# which takes a function and applies it to each value in a pandas Series
mtcars['lp100km'] = mtcars['mpg'].apply(func=lp100)
mtcars
# Another way of doing the previous operation is using a lambda function:
# an anonymous function, defined inline, applied to each item of a Series.
# Take every column except the last one ('lp100km', added above). The .copy()
# makes mtcars2 an independent dataframe, so assigning a column to it neither
# triggers a SettingWithCopyWarning nor risks touching mtcars.
mtcars2 = mtcars.iloc[:, :-1].copy()
mtcars2['lp100km'] = mtcars2['mpg'].apply(lambda x: 100 * 3.78 / (1.6 * x))
mtcars2
# A third way to add a column: the built-in DataFrame.insert method, which
# places the new column at a chosen position (here: after the last column)
mtcars3 = mtcars.iloc[:, :-1].copy()
mtcars3.insert(len(mtcars3.columns), 'lp100km', mtcars3['mpg'].apply(lambda x: 100 * 3.78 / (1.6 * x)))
mtcars3
# Build a boolean mask: one True/False per column, True where the column
# name matches the pattern 'lp100'
remove_bools = [
    re.search('lp100', col_name) is not None
    for col_name in mtcars.columns
]
remove_bools
# The same kind of mask, this time built with the pandas string methods
remove_bools_pd = mtcars2.columns.str.contains('lp100').tolist()
remove_bools_pd
# Dropping columns from a dataframe
# The mask picks the column names to remove: every column marked True is dropped.
# drop() can remove rows (axis=0) or columns (axis=1); columns= is the column form.
# drop() returns a new dataframe, so assign the result to the variable that should hold it
mtcars = mtcars.drop(columns=mtcars.columns[remove_bools].tolist())
mtcars
# Reset the index (useful after subsetting the data); by default the old index
# is kept as a regular column instead of being discarded
mtcars4 = mtcars.reset_index()
mtcars4
# Pipes apply a series of operations to the dataframe (prone to error);
# might be useful for batch preprocessing data using a for loop.
# Each step receives the result of the previous step as its first argument.
# General pattern: df.pipe(func_1).pipe(lambda df: ...).pipe(func, arg1, arg2)
# The result is a new dataframe; mtcars itself is left unchanged.
(
    mtcars
    .pipe(pd.DataFrame.dropna)  # a plain function that takes the dataframe
    .pipe(lambda df: df[df['cyl'] > 4])  # an inline lambda
    .pipe(pd.DataFrame.sort_values, 'mpg', ascending=False)  # extra arguments are forwarded to the function
)
# Read an image from disk (into a numpy array)
image = cv2.imread('maserati_bora.jpg')
image
# View the image as loaded. The colours will look wrong at this point:
# matplotlib interprets the three channels as RGB, but OpenCV stored them as BGR
plt.figure(figsize=(15, 20))
plt.imshow(image)
# Colour images are usually handled as RGB, but OpenCV loads them as BGR (Blue, Green, Red)
# Images read as numpy arrays are 3-dimensional (height/rows of pixels, width/columns of pixels, color channels)
image.shape
# Convert from OpenCV's BGR channel order to RGB
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Show the image again, now with the channels in the order matplotlib expects
plt.figure(figsize=(15, 20))
plt.imshow(image_rgb)
## Test your skills
# TODO 1: How many color channels in a grayscale image?
# TODO 2: How to transform a colored image into grayscale image?
# Resizing an image to 48 x 48 pixels (the colour channels are kept)
image_resize = cv2.resize(image, (48, 48))
image_resize.shape
# Transform the 3D image numpy array into a raw vector of 1 row and however
# many columns are needed to hold the data (-1 lets numpy work that out).
# reshape() returns a new array and leaves the original untouched, so the
# result has to be assigned back for the change to take effect.
image_resize = image_resize.reshape((1, -1))
image_resize.shape
# Read a second image from disk (into a numpy array)
image2 = cv2.imread('s_klasse.jpg')
image2
# View the image as loaded (still in OpenCV's BGR channel order)
plt.figure(figsize=(15, 20))
plt.imshow(image2)
# Convert from OpenCV's BGR to grayscale
image2_grayscale = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)
# Check the shape of the converted array
image2_grayscale.shape
# Show the grayscale array; no cmap is given, so matplotlib applies its default colormap
plt.figure(figsize=(15, 20))
plt.imshow(image2_grayscale)
# Show the same array with explicitly chosen colormaps
plt.imshow(image2_grayscale, cmap='Blues')
# NOTE(review): matplotlib's 'Greys' runs from white (low values) to black (high values),
# so this likely displays a negative; 'gray' is the black-to-white map. Confirm which is intended.
plt.imshow(image2_grayscale, cmap='Greys')
# Steps similar to creating the dataset you have for the coursework:
# every image becomes one row (48 * 48 pixels * 3 channels values), and the
# rows are stacked on top of each other.
image2_resize = cv2.resize(image2, (48, 48)).reshape((1, -1))
# np.vstack needs both arrays to have the same number of columns, so make sure
# the first image is flattened to a single row as well (reshaping an array that
# is already a single row changes nothing).
dataset = np.vstack([image_resize.reshape((1, -1)), image2_resize])
dataset
dataset.shape
# Bar plot of the number of cars for each mpg value
# (only every second x tick label is shown, see step=2 below)
# catplot replaces the older factorplot, and the variable to plot
# must be passed by keyword (x=...).
with sns.axes_style('white'):
    g = sns.catplot(x="mpg", data=mtcars, aspect=3,
                    kind="count", color='steelblue')
    g.set_xticklabels(step=2)
# Simple linear regression with density estimation for mpg vs cylinders
sns.jointplot(x="mpg", y="cyl", data=mtcars, kind='reg')
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Labels for our dataset: start with a column of zeros, one row per car
labels = np.zeros((len(mtcars), 1))
labels
labels.shape
# Cars made by Mercedes should get the label 1, every other make the label 0
# Boolean mask: True where the car name contains 'Merc'
mtcars4['index'].str.contains('Merc')
# Write 1 into the rows selected by the mask
labels[mtcars4['index'].str.contains('Merc')] = 1
labels
# Keep only the numeric columns as training data. This leaves out the 'index'
# column: it contains the label information in text format, which is not useful
# as a feature, and the target should be kept separate from the training data.
# 'lp100km' is derived from 'mpg', so it is left out too. errors='ignore' is
# needed because that column was already dropped before mtcars4 was created;
# without it, drop() raises a KeyError for the missing column.
mtcars_num = mtcars4.drop(columns=['lp100km'], errors='ignore').select_dtypes(include=['float64', 'int64'])
mtcars_num
# Instantiate a decision tree classifier from Scikit-Learn
dec_tree = DecisionTreeClassifier()
# Fit the algorithm to our data
# labels has shape (n_samples, 1); scikit-learn expects a 1-D target, so
# ravel() flattens it to shape (n_samples,)
dec_tree.fit(X=mtcars_num, y=labels.ravel())
# Calculate the accuracy (on the same data the tree was trained on)
dec_tree.score(X=mtcars_num, y=labels.ravel())
# Plot the decision tree
# Brightness of the colors represents the confidence of the decision tree
# Gini coefficient is 0 when no further splits can be done
# Left is True and right is false
# class_names lists the display names of the classes in ascending order of the
# label values (0 -> 'Not Merc', 1 -> 'Merc'). The `label` parameter is a
# different option that only controls which nodes show descriptive text.
plot_tree(decision_tree=dec_tree, class_names=['Not Merc', 'Merc'], filled=True, feature_names=mtcars_num.columns.tolist())
# Simple linear regression model: horsepower (hp) explained by mpg
# statsmodels' OLS does not add an intercept on its own, so add a constant
# column to the predictors; otherwise the fitted line is forced through the origin
reg_model = sm.OLS(endog=mtcars['hp'], exog=sm.add_constant(mtcars['mpg'])).fit()
reg_model.summary()